# Author: Stephen Situ
# This is a project that explores the different ways data can be cleaned and pre processed for machine learning. The first method
# is just sequentially cleaning the data manually one step at a time. This is good for infrequent use. The next method is to
# define a function and pass the dataframe into the function. The last and most preferred method is to create a pipeline that
# can utilize the scikit learn pipeline library. It sequentially applies a list of transformations and a final estimator.
# They must implement fit and transform methods. The final estimator only needs to implement fit.
# Import pandas and Numpy
import pandas as pd
import numpy as np
# Read CSV
df = pd.read_csv('cars_raw.csv')
# Head
df.head()
Year | Make | Model | Used/New | Price | ConsumerRating | ConsumerReviews | SellerType | SellerName | SellerRating | ... | InteriorColor | Drivetrain | MinMPG | MaxMPG | FuelType | Transmission | Engine | VIN | Stock# | Mileage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019 | Toyota | Sienna SE | Used | $39,998 | 4.6 | 45 | Dealer | CarMax Murrieta - Now offering Curbside Pickup... | 3.3 | ... | Black | Front-wheel Drive | 19 | 27 | Gasoline | 8-Speed Automatic | 3.5L V6 24V PDI DOHC | 5TDXZ3DC2KS015402 | 22998646 | 29403 |
1 | 2018 | Ford | F-150 Lariat | Used | $49,985 | 4.8 | 817 | Dealer | Giant Chevrolet | 4.8 | ... | Black | Four-wheel Drive | 19 | 24 | Gasoline | 10-Speed Automatic | 3.5L V6 24V PDI DOHC Twin Turbo | 1FTEW1EG2JFD44217 | 22418A | 32929 |
2 | 2017 | RAM | 1500 Laramie | Used | $41,860 | 4.7 | 495 | Dealer | Gill Auto Group Madera | 4.6 | ... | Black | Four-wheel Drive | 15 | 21 | Gasoline | 8-Speed Automatic | 5.7L V8 16V MPFI OHV | 1C6RR7VT5HS842283 | NG277871G | 23173 |
3 | 2021 | Honda | Accord Sport SE | Used | $28,500 | 5.0 | 36 | Dealer | AutoSavvy Las Vegas | 4.6 | ... | – | Front-wheel Drive | 29 | 35 | Gasoline | Automatic CVT | 1.5L I4 16V GDI DOHC Turbo | 1HGCV1F49MA038035 | 54237 | 10598 |
4 | 2020 | Lexus | RX 350 | Used | $49,000 | 4.8 | 76 | Dealer | Lexus of Henderson | 4.8 | ... | Birch | Front-wheel Drive | 20 | 27 | Gasoline | 8-Speed Automatic | 3.5L V6 24V PDI DOHC | 2T2AZMAA8LC156270 | HDT4181A | 28137 |
5 rows × 32 columns
# Describe
df.describe()
Year | ConsumerRating | ConsumerReviews | SellerRating | SellerReviews | ComfortRating | InteriorDesignRating | PerformanceRating | ValueForMoneyRating | ExteriorStylingRating | ReliabilityRating | MinMPG | MaxMPG | Mileage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 9379.000000 | 9379.000000 | 9379.000000 | 9379.000000 | 9379.000000 | 9379.000000 | 9379.000000 | 9379.000000 | 9379.000000 | 9379.000000 | 9379.000000 | 9379.000000 | 9379.000000 | 9379.000000 |
mean | 2018.721719 | 4.702825 | 133.187014 | 4.412571 | 984.089988 | 4.771895 | 4.727391 | 4.696290 | 4.537083 | 4.782194 | 4.681746 | 22.755411 | 29.216548 | 37463.023350 |
std | 2.221708 | 0.240795 | 154.985640 | 0.626258 | 1609.039864 | 0.217822 | 0.194391 | 0.253664 | 0.338098 | 0.171537 | 0.368161 | 14.812869 | 12.809783 | 24970.342569 |
min | 2001.000000 | 2.500000 | 1.000000 | 1.000000 | 1.000000 | 3.000000 | 2.800000 | 1.000000 | 1.000000 | 3.000000 | 1.000000 | 0.000000 | 0.000000 | 121.000000 |
25% | 2018.000000 | 4.700000 | 30.000000 | 4.300000 | 112.000000 | 4.700000 | 4.700000 | 4.600000 | 4.500000 | 4.700000 | 4.600000 | 18.000000 | 25.000000 | 18666.500000 |
50% | 2019.000000 | 4.800000 | 75.000000 | 4.600000 | 542.000000 | 4.800000 | 4.800000 | 4.700000 | 4.600000 | 4.800000 | 4.800000 | 20.000000 | 27.000000 | 32907.000000 |
75% | 2020.000000 | 4.800000 | 182.000000 | 4.800000 | 1272.000000 | 4.900000 | 4.800000 | 4.800000 | 4.700000 | 4.900000 | 4.900000 | 24.000000 | 31.000000 | 47698.000000 |
max | 2022.000000 | 5.000000 | 817.000000 | 5.000000 | 27824.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 | 150.000000 | 133.000000 | 234114.000000 |
# data types
df.dtypes
Year int64 Make object Model object Used/New object Price object ConsumerRating float64 ConsumerReviews int64 SellerType object SellerName object SellerRating float64 SellerReviews int64 StreetName object State object Zipcode object DealType object ComfortRating float64 InteriorDesignRating float64 PerformanceRating float64 ValueForMoneyRating float64 ExteriorStylingRating float64 ReliabilityRating float64 ExteriorColor object InteriorColor object Drivetrain object MinMPG int64 MaxMPG int64 FuelType object Transmission object Engine object VIN object Stock# object Mileage int64 dtype: object
# drop columns
df = df.drop(["Model","SellerName","StreetName","Zipcode","DealType","ExteriorColor","InteriorColor","Transmission","Engine","VIN","Stock#","Used/New","FuelType","Drivetrain"],axis=1)
# Head
df.head()
Year | Make | Price | ConsumerRating | ConsumerReviews | SellerType | SellerRating | SellerReviews | State | ComfortRating | InteriorDesignRating | PerformanceRating | ValueForMoneyRating | ExteriorStylingRating | ReliabilityRating | MinMPG | MaxMPG | Mileage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019 | Toyota | $39,998 | 4.6 | 45 | Dealer | 3.3 | 3 | CA | 4.7 | 4.6 | 4.6 | 4.4 | 4.6 | 4.7 | 19 | 27 | 29403 |
1 | 2018 | Ford | $49,985 | 4.8 | 817 | Dealer | 4.8 | 131 | CA | 4.9 | 4.8 | 4.8 | 4.6 | 4.8 | 4.7 | 19 | 24 | 32929 |
2 | 2017 | RAM | $41,860 | 4.7 | 495 | Dealer | 4.6 | 249 | CA | 4.8 | 4.7 | 4.8 | 4.6 | 4.8 | 4.7 | 15 | 21 | 23173 |
3 | 2021 | Honda | $28,500 | 5.0 | 36 | Dealer | 4.6 | 284 | NV | 4.9 | 5.0 | 4.9 | 5.0 | 5.0 | 5.0 | 29 | 35 | 10598 |
4 | 2020 | Lexus | $49,000 | 4.8 | 76 | Dealer | 4.8 | 4755 | NV | 4.9 | 4.8 | 4.8 | 4.7 | 4.8 | 4.9 | 20 | 27 | 28137 |
# delete $ from string "Price"
# delete "Not Priced"
df["Price"] = df["Price"].str.replace("$","")
df["Price"] = df["Price"].str.replace(",","")
df["Price"] = df["Price"].str.replace("Not Priced","")
C:\Users\Steve\AppData\Local\Temp\ipykernel_18592\3117742891.py:3: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. df["Price"] = df["Price"].str.replace("$","")
# Cast Price to numeric
df["Price"] = pd.to_numeric(df["Price"])
# Impute price as mean
df['Price'].fillna((df['Price'].mean()), inplace=True)
# Hot encode Categirucal features
df = pd.get_dummies(df)
df.to_csv("clean_normal.csv",index=False)
# Head
df.head
<bound method NDFrame.head of Year Price ConsumerRating ConsumerReviews SellerRating \ 0 2019 39998.0 4.6 45 3.3 1 2018 49985.0 4.8 817 4.8 2 2017 41860.0 4.7 495 4.6 3 2021 28500.0 5.0 36 4.6 4 2020 49000.0 4.8 76 4.8 ... ... ... ... ... ... 9374 2019 27374.0 4.7 205 4.4 9375 2019 61998.0 4.8 27 4.8 9376 2017 26944.0 4.8 137 4.7 9377 2019 28568.0 4.7 279 4.4 9378 2019 32091.0 4.8 204 4.4 SellerReviews ComfortRating InteriorDesignRating PerformanceRating \ 0 3 4.7 4.6 4.6 1 131 4.9 4.8 4.8 2 249 4.8 4.7 4.8 3 284 4.9 5.0 4.9 4 4755 4.9 4.8 4.8 ... ... ... ... ... 9374 443 4.7 4.7 4.6 9375 1789 4.9 4.8 4.8 9376 831 4.9 4.8 4.7 9377 680 4.8 4.7 4.6 9378 1105 4.9 4.9 4.6 ValueForMoneyRating ... State_TX State_US-12 State_US-169 State_UT \ 0 4.4 ... 0 0 0 0 1 4.6 ... 0 0 0 0 2 4.6 ... 0 0 0 0 3 5.0 ... 0 0 0 0 4 4.7 ... 0 0 0 0 ... ... ... ... ... ... ... 9374 4.7 ... 0 0 0 0 9375 4.6 ... 0 0 0 0 9376 4.6 ... 0 0 0 0 9377 4.7 ... 0 0 0 0 9378 4.8 ... 0 0 0 0 State_VA State_VT State_WA State_WI State_WV State_WY 0 0 0 0 0 0 0 1 0 0 0 0 0 0 2 0 0 0 0 0 0 3 0 0 0 0 0 0 4 0 0 0 0 0 0 ... ... ... ... ... ... ... 9374 0 0 0 0 0 0 9375 0 0 0 0 0 0 9376 0 0 0 0 0 0 9377 0 0 0 0 0 0 9378 0 0 0 0 0 0 [9379 rows x 117 columns]>
# Create a function to preform the cleaning and transformation task and test it
def clean(X):
X = X.drop(["Model","SellerName","StreetName","Zipcode","DealType","ExteriorColor","InteriorColor","Transmission","Engine","VIN","Stock#","Used/New","FuelType","Drivetrain"],axis=1)
X["Price"] = X["Price"].str.replace("$","")
X["Price"] = X["Price"].str.replace(",","")
X["Price"] = X["Price"].str.replace("Not Priced","")
X["Price"] = pd.to_numeric(X["Price"])
X['Price'].fillna((X['Price'].mean()), inplace=True)
X = pd.get_dummies(X)
X.to_csv("clean_function.csv",index=False)
return(X)
# Should get the same result as before
df2 = pd.read_csv('cars_raw.csv')
clean(df2)
C:\Users\Steve\AppData\Local\Temp\ipykernel_18592\2668432355.py:4: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. X["Price"] = X["Price"].str.replace("$","")
Year | Price | ConsumerRating | ConsumerReviews | SellerRating | SellerReviews | ComfortRating | InteriorDesignRating | PerformanceRating | ValueForMoneyRating | ... | State_TX | State_US-12 | State_US-169 | State_UT | State_VA | State_VT | State_WA | State_WI | State_WV | State_WY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019 | 39998.0 | 4.6 | 45 | 3.3 | 3 | 4.7 | 4.6 | 4.6 | 4.4 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 2018 | 49985.0 | 4.8 | 817 | 4.8 | 131 | 4.9 | 4.8 | 4.8 | 4.6 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 2017 | 41860.0 | 4.7 | 495 | 4.6 | 249 | 4.8 | 4.7 | 4.8 | 4.6 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 2021 | 28500.0 | 5.0 | 36 | 4.6 | 284 | 4.9 | 5.0 | 4.9 | 5.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 2020 | 49000.0 | 4.8 | 76 | 4.8 | 4755 | 4.9 | 4.8 | 4.8 | 4.7 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9374 | 2019 | 27374.0 | 4.7 | 205 | 4.4 | 443 | 4.7 | 4.7 | 4.6 | 4.7 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9375 | 2019 | 61998.0 | 4.8 | 27 | 4.8 | 1789 | 4.9 | 4.8 | 4.8 | 4.6 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9376 | 2017 | 26944.0 | 4.8 | 137 | 4.7 | 831 | 4.9 | 4.8 | 4.7 | 4.6 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9377 | 2019 | 28568.0 | 4.7 | 279 | 4.4 | 680 | 4.8 | 4.7 | 4.6 | 4.7 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9378 | 2019 | 32091.0 | 4.8 | 204 | 4.4 | 1105 | 4.9 | 4.9 | 4.6 | 4.8 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9379 rows × 117 columns
# Using sci-kit learn we can create a pipeline that uses estimators and transformers
from sklearn.base import BaseEstimator, TransformerMixin
class Columndropper(BaseEstimator, TransformerMixin):
def fit(self,X,y=None):
return self
def transform(self, X):
return X.drop(["Model","SellerName","StreetName","Zipcode","DealType","ExteriorColor","InteriorColor","Transmission","Engine","VIN","Stock#","Used/New","FuelType","Drivetrain"],axis=1)
class PriceCleaner(BaseEstimator, TransformerMixin):
def fit(self,X,y=None):
return self
def transform(self, X):
X["Price"] = X["Price"].str.replace("$","")
X["Price"] = X["Price"].str.replace(",","")
X["Price"] = X["Price"].str.replace("Not Priced","")
return X
class CastNumeric(BaseEstimator, TransformerMixin):
def fit(self,X,y=None):
return self
def transform(self, X):
X["Price"] = pd.to_numeric(X["Price"])
return X
class Imputer(BaseEstimator, TransformerMixin):
def fit(self,X,y=None):
return self
def transform(self, X):
X['Price'].fillna((X['Price'].mean()), inplace=True)
return X
class HotEncode(BaseEstimator, TransformerMixin):
def fit(self,X,y=None):
return self
def transform(self, X):
X = pd.get_dummies(X)
return X
# Use a new dataframe, df3
df3 = pd.read_csv('cars_raw.csv')
# Call the functions
# now we can use classes/methods/attributes to do the transformation
column_drop = Columndropper()
price_cleaner = PriceCleaner()
cast_numeric = CastNumeric()
imputer = Imputer()
hot_encode = HotEncode()
df_clean = hot_encode.transform(imputer.transform(cast_numeric.transform(price_cleaner.transform(column_drop.transform(df3)))))
df_clean.head()
C:\Users\Steve\AppData\Local\Temp\ipykernel_18592\455890638.py:14: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. X["Price"] = X["Price"].str.replace("$","")
Year | Price | ConsumerRating | ConsumerReviews | SellerRating | SellerReviews | ComfortRating | InteriorDesignRating | PerformanceRating | ValueForMoneyRating | ... | State_TX | State_US-12 | State_US-169 | State_UT | State_VA | State_VT | State_WA | State_WI | State_WV | State_WY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019 | 39998.0 | 4.6 | 45 | 3.3 | 3 | 4.7 | 4.6 | 4.6 | 4.4 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 2018 | 49985.0 | 4.8 | 817 | 4.8 | 131 | 4.9 | 4.8 | 4.8 | 4.6 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 2017 | 41860.0 | 4.7 | 495 | 4.6 | 249 | 4.8 | 4.7 | 4.8 | 4.6 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 2021 | 28500.0 | 5.0 | 36 | 4.6 | 284 | 4.9 | 5.0 | 4.9 | 5.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 2020 | 49000.0 | 4.8 | 76 | 4.8 | 4755 | 4.9 | 4.8 | 4.8 | 4.7 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 117 columns
df4 = pd.read_csv('cars_raw.csv')
# Define pipe as a list of tuples
from sklearn.pipeline import Pipeline
pipe = Pipeline([
("column_drop", Columndropper()),
("price_cleaner", PriceCleaner()),
("cast_numeric", CastNumeric()),
("imputer",Imputer()),
("hot_encode",HotEncode())
])
# Use fit_transform and get the same result
df_pipe_cleaned = pipe.fit_transform(df4)
df_pipe_cleaned.to_csv("clean_pipeline.csv",index=False)
df_pipe_cleaned.head()
C:\Users\Steve\AppData\Local\Temp\ipykernel_18592\455890638.py:14: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. X["Price"] = X["Price"].str.replace("$","")
Year | Price | ConsumerRating | ConsumerReviews | SellerRating | SellerReviews | ComfortRating | InteriorDesignRating | PerformanceRating | ValueForMoneyRating | ... | State_TX | State_US-12 | State_US-169 | State_UT | State_VA | State_VT | State_WA | State_WI | State_WV | State_WY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019 | 39998.0 | 4.6 | 45 | 3.3 | 3 | 4.7 | 4.6 | 4.6 | 4.4 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 2018 | 49985.0 | 4.8 | 817 | 4.8 | 131 | 4.9 | 4.8 | 4.8 | 4.6 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 2017 | 41860.0 | 4.7 | 495 | 4.6 | 249 | 4.8 | 4.7 | 4.8 | 4.6 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 2021 | 28500.0 | 5.0 | 36 | 4.6 | 284 | 4.9 | 5.0 | 4.9 | 5.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 2020 | 49000.0 | 4.8 | 76 | 4.8 | 4755 | 4.9 | 4.8 | 4.8 | 4.7 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 117 columns